# -*- coding: utf-8 -*-
"""
Code Introduction:
    This code 
Version History:
    Created: Mon Oct  5 14:03:46 2020
    Current: 

@author: Xing Guo (guoxing.econ@gmail.com)

"""

#%% Setup Working Directory

import os
## Windows System Path
FolderList = [xx+"Dropbox (Bank of Canada)\\Research Projects\\OHANK\\Empirics\\" \
              for xx in ["E:\\","B:\\","/mnt/b/"]]
for Folder in FolderList:
    if os.path.exists(Folder):
        os.chdir(Folder)    

## Output Folder
OutputFolder = 'TableGraph/'
if not os.path.exists(OutputFolder):
    os.makedirs(OutputFolder)
# End of Section: Setup Working Directory
###############################################################################

#%% Import Moduels

## System Tools

import numpy as np
from collections import OrderedDict
import time
## I/O Tools
import _pickle as pickle
## Data Process Tools
import pandas as pd
# import modin.pandas as pd
import datetime
## Graphs
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as figpdf
## Statistical Tools
import statsmodels.formula.api as sm
from statsmodels.tsa.api import VAR
from scipy.stats import mstats
from scipy.interpolate import interp1d
import statsmodels.api as SMAPI
from statsmodels.tsa.tsatools import detrend as DeTrend
from statsmodels.tsa.filters.hp_filter import hpfilter as HPfilter
from statsmodels.tsa.filters.bk_filter import bkfilter as BKfilter
## Database API
from fredapi import Fred
## Numerical API
from scipy.interpolate import interp1d
## Regular Expression API
import re
import multiprocessing as mp

idx = pd.IndexSlice

# from Toolbox_Graph import Graph_PDF, GraphSetup, MultiLine_SinglePlot, NBER_RecessionBar, IRF_SinglePlot
# End of Section: Import Moduels
###############################################################################

#%% Cleaning

### Generate the Sample

## Data
SFS_2016 = pd.read_stata('Data/SFS/sfs2016en.dta')

## Variables
VarInfo = pd.read_excel('Data/SFS/VarDict.xlsx')
VarDict = {VarInfo.loc[ii,'Code'].lower(): VarInfo.loc[ii,'Name'] for ii in VarInfo.index}

## Sample
DS = SFS_2016.rename(columns=VarDict)[VarDict.values()]

### Functions to Compute the Distributional Features
def QW_ApproxPoints(qq,qw=[]):
    TempInd = ~pd.isnull(qq)
    if len(qw)==0:
        qw = np.ones(len(qq))/len(qq)
    else:
        qw = qw[TempInd].values
        qw = qw/sum(qw)
    
    qq = qq[TempInd].values
    
    OrderInd = np.argsort(qq)
    qq = qq[OrderInd]
    qw = qw[OrderInd]
    return qq, qw

def QW_Quantile(xx,qq,qw=[]):
    qq, qw = QW_ApproxPoints(qq,qw)
    cdf = np.cumsum(qw)
    ff = interp1d(cdf,qq,kind='cubic')
    return ff(xx)

def Dist_byQuantile(DS,qq_name,qw_name='',xx=[0.25,0.50,0.75]):
    qq = DS[qq_name]
    if qw_name=='':
        qw = qq.copy()
        qw = 1
    else:
        qw = DS[qw_name]
        
    quant = QW_Quantile(xx,qq,qw=qw)
    qq, qw = QW_ApproxPoints(qq,qw)
    qq_qw = qq * qw
    cdf = np.cumsum(qw)
    
    AccQQ_up = np.zeros(len(xx))
    AccQW_up = np.zeros(len(xx))
    Total = sum(qq_qw)
    for ii in range(len(xx)):
        TempInd = cdf>=xx[ii]
        AccQQ_up[ii] = sum(qq_qw[TempInd])/Total
        AccQW_up[ii] = sum(qw[TempInd])
    Dist = pd.concat([pd.Series(AccQW_up,name='AccQW'), \
                      pd.Series(AccQQ_up,name='AccQQ_up'), \
                      pd.Series(quant,name='QQ'),pd.Series(xx,name='Quant')],axis=1)
    Dist['AccQQ_down'] = 1-Dist['AccQQ_up']
    return Dist

def Dist_byHistogram(DS,qq_name,qw_name,bins=50):
    TempDS = DS[[qq_name,qw_name]].dropna()
    if type(bins)==int:
        cutoff = np.linspace(DS[qq_name].min(),DS[qq_name].max(),num=bins-1)
        num = bins
    elif type(bins)==list: 
        cutoff = bins
        num = len(bins)+1
    
    cutoffbins = [-np.inf,*cutoff,np.inf]
    TempDS['Group'] = pd.cut(TempDS[qq_name],bins=cutoffbins,labels=range(num))
    TempDS['QQ'] = TempDS[qq_name]
    TempDS['QW'] = TempDS[qw_name]/TempDS[qw_name].sum()
    TempDS['QQ_QW'] = TempDS['QQ']*TempDS['QW']
    Hist = TempDS.groupby('Group')[['QW','QQ_QW']].sum()
    Hist['QQ_left'] = pd.Series(cutoffbins[0:-1])
    Hist['QQ_right'] = pd.Series(cutoffbins[1:])
    Hist['QQ_mid'] = (Hist['QQ_left']+Hist['QQ_right'])/2
    Hist.loc[0,'QQ_mid'] = Hist.loc[0,'QQ_right']
    Hist.loc[num-1,'QQ_mid'] = Hist.loc[num-1,'QQ_left']
    
    return Hist

def Dist_GiniCoef(DS,qq_name,qw_name):
    TempDS = DS[[qq_name,qw_name]].sort_values(by=qq_name).reset_index(drop=True)
    TempDS['QW'] = TempDS[qw_name]/TempDS[qw_name].sum()
    TempDS['QQ_QW'] = TempDS[qq_name]*TempDS['QW']
    TempDS['QQ_QW'] = TempDS['QQ_QW']/TempDS['QQ_QW'].sum()
    TempDS['AccProb'] = TempDS['QW'].cumsum()
    TempDS['AccQQ'] = TempDS['QQ_QW'].cumsum()
    TempDS['AccProb_'] = (TempDS['AccProb']+TempDS['AccProb'].shift(1))/2
    TempDS.loc[0,'AccProb_'] = TempDS.loc[0,'AccProb']/2
    TempDS['AccQQ_'] = (TempDS['AccQQ']+TempDS['AccQQ'].shift(1))/2
    TempDS.loc[0,'AccQQ_'] = TempDS.loc[0,'AccQQ']/2
    Gini = (TempDS['AccQQ_']*TempDS['QW']).sum()/ \
           (TempDS['AccProb_']*TempDS['QW']).sum()
    return Gini
### Distribution of Income
Var_Income = 'Income_Market'
DS[Var_Income] = DS[Var_Income].astype('float')

## Summary Statistics
QuantList = [0.01,0.05,0.10,0.25,0.5,0.75,0.90,0.95,0.99]
Income_SumStat = DS[Var_Income].describe(percentiles=QuantList)

## Detailed Quantiles
QuantList = list(np.linspace(0.01,0.99,num=99))
IncomeDist_byQuant = Dist_byQuantile(DS,Var_Income,'StatWeight',QuantList)

## Detailed Histogram
HistList = list(np.linspace(0,36*1e4,37))
IncomeDist_byHist = Dist_byHistogram(DS,Var_Income,'StatWeight',bins=HistList)


### Distribution of Net Worth
Var_Wealth = 'NetWorth_G'

## Summary Statistics
QuantList = [0.01,0.05,0.10,0.25,0.5,0.75,0.90,0.95,0.99]
Wealth_SumStat = DS[Var_Wealth].describe(percentiles=QuantList)

## by Quantiles
QuantList = list(np.linspace(0.01,0.99,num=99))
WealthDist_byQuant = Dist_byQuantile(DS,Var_Wealth,'StatWeight',QuantList)

## by Histogram Grids
HistList = [*list(np.linspace(-3*1e4,0-1e3,4)),*list(np.linspace(1e3,700*1e4,701))]
WealthDist_byHist = Dist_byHistogram(DS,Var_Wealth,'StatWeight',bins=HistList)
# End of Section:
###############################################################################